FIND ME A HOME

Clustering of Toronto’s neighborhoods based on Crime Rate, Local Employment, Childcare spaces and nearby Venues.


Author: Nitish Bhardwaj

Date Created: 2nd May 2020

Last Updated: 14th May 2020

In [1]:
# Data manipulation
import pandas as pd
import numpy as np

# Web scraping
from bs4 import BeautifulSoup
import requests

# Geocoding
from geopy.extra.rate_limiter import RateLimiter
from geopy.geocoders import Nominatim

# Visualization
import folium
from folium.plugins import MarkerCluster
import branca  # A spinoff of Folium that supports HTML+JS
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as colors
import plotly.graph_objects as go

# Flatten JSON payloads into a pandas dataframe.
# (pandas.io.json.json_normalize was deprecated in pandas 1.0; the public
# location is the top-level pandas namespace.)
from pandas import json_normalize

# Clustering and preprocessing
from sklearn.cluster import KMeans
from sklearn.preprocessing import power_transform, StandardScaler

# Statistics used to measure/verify skewness in the data
from scipy import stats

# Clustering quality metric
from sklearn.metrics import silhouette_score

# Suppress warnings
import warnings
warnings.filterwarnings('ignore')
In [2]:
#Formatting the print statements
class style:
    """ANSI escape sequences for emphasising print() output in the notebook.

    Usage: print(style.BOLD + "text" + style.END). Always terminate a styled
    string with END so subsequent output is not affected.
    """
    # Foreground colours
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'
    # Text attributes
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
    # Reset all styling
    END = '\033[0m'
In [3]:
# Neighbourhood boundary/centroid data from the City of Toronto open data portal:
# https://open.toronto.ca/dataset/neighbourhoods/
# Assumes Neighbourhoods.csv sits next to the notebook — TODO confirm path.
df_geoDetails=pd.read_csv("Neighbourhoods.csv")
df_geoDetails.head()
Out[3]:
_id AREA_ID AREA_ATTR_ID PARENT_AREA_ID AREA_SHORT_CODE AREA_LONG_CODE AREA_NAME AREA_DESC X Y LONGITUDE LATITUDE OBJECTID Shape__Area Shape__Length geometry
0 4901 25886861 25926662 49885 94 94 Wychwood (94) Wychwood (94) NaN NaN -79.425515 43.676919 16491505 3.217960e+06 7515.779658 {u'type': u'Polygon', u'coordinates': (((-79.4...
1 4902 25886820 25926663 49885 100 100 Yonge-Eglinton (100) Yonge-Eglinton (100) NaN NaN -79.403590 43.704689 16491521 3.160334e+06 7872.021074 {u'type': u'Polygon', u'coordinates': (((-79.4...
2 4903 25886834 25926664 49885 97 97 Yonge-St.Clair (97) Yonge-St.Clair (97) NaN NaN -79.397871 43.687859 16491537 2.222464e+06 8130.411276 {u'type': u'Polygon', u'coordinates': (((-79.3...
3 4904 25886593 25926665 49885 27 27 York University Heights (27) York University Heights (27) NaN NaN -79.488883 43.765736 16491553 2.541821e+07 25632.335242 {u'type': u'Polygon', u'coordinates': (((-79.5...
4 4905 25886688 25926666 49885 31 31 Yorkdale-Glen Park (31) Yorkdale-Glen Park (31) NaN NaN -79.457108 43.714672 16491569 1.156669e+07 13953.408098 {u'type': u'Polygon', u'coordinates': (((-79.4...
In [4]:
# Keep only the columns we need, strip the "(id)" suffix from the names
# (e.g. "Wychwood (94)" -> "Wychwood"), and rename columns for later joins.
df_geoDetails = df_geoDetails[['AREA_NAME', 'AREA_SHORT_CODE', 'LONGITUDE', 'LATITUDE']]
# regex=True is passed explicitly: from pandas 2.0 the default for
# Series.str.replace is a literal match, which would leave the suffix intact.
df_geoDetails['AREA_NAME'] = df_geoDetails['AREA_NAME'].str.replace(r"\s*\([^()]*\)", "", regex=True).str.strip()
df_geoDetails.rename(columns={"AREA_NAME": "Neighbourhood", "AREA_SHORT_CODE": "Neighbourhood_Id"}, inplace=True)
df_geoDetails.sort_values(by=['Neighbourhood_Id']).head()
Out[4]:
Neighbourhood Neighbourhood_Id LONGITUDE LATITUDE
63 West Humber-Clairville 1 -79.596356 43.716180
20 Mount Olive-Silverstone-Jamestown 2 -79.587259 43.746868
56 Thistletown-Beaumond Heights 3 -79.563491 43.737988
40 Rexdale-Kipling 4 -79.566228 43.723725
112 Elms-Old Rexdale 5 -79.548983 43.721519
In [5]:
# Economic well-being indicators per neighbourhood, from the City of Toronto:
# https://open.toronto.ca/dataset/wellbeing-toronto-economics/
df_economics = pd.read_excel("wellbeing-toronto-economics.xlsx")
# Normalise the join-key column name to match df_geoDetails
df_economics.rename(columns={"Neighbourhood Id": "Neighbourhood_Id" }, inplace=True)
df_economics.head()
Out[5]:
Neighbourhood Neighbourhood_Id Businesses Child Care Spaces Debt Risk Score Home Prices Local Employment Social Assistance Recipients
0 West Humber-Clairville 1 2463 195 719 317508 58271 2912
1 Mount Olive-Silverstone-Jamestown 2 271 60 687 251119 3244 6561
2 Thistletown-Beaumond Heights 3 217 25 718 414216 1311 1276
3 Rexdale-Kipling 4 144 75 721 392271 1178 1323
4 Elms-Old Rexdale 5 67 60 692 233832 903 1683
In [6]:
# Keep only the economic indicators used for clustering, then merge with the
# geographic data. Neighbourhood_Id is cast to str on both sides so the join
# keys have matching dtypes.
df_economics = df_economics[['Neighbourhood_Id','Home Prices', 'Child Care Spaces','Local Employment']]
df = pd.merge(
    df_geoDetails.astype({'Neighbourhood_Id': str}),
    df_economics.astype({'Neighbourhood_Id': str}),
    on='Neighbourhood_Id')
df.head()
Out[6]:
Neighbourhood Neighbourhood_Id LONGITUDE LATITUDE Home Prices Child Care Spaces Local Employment
0 Wychwood 94 -79.425515 43.676919 656868 84 5143
1 Yonge-Eglinton 100 -79.403590 43.704689 975449 45 11746
2 Yonge-St.Clair 97 -79.397871 43.687859 995616 20 7858
3 York University Heights 27 -79.488883 43.765736 359372 156 42885
4 Yorkdale-Glen Park 31 -79.457108 43.714672 421045 82 24685
In [7]:
# Safety / crime counts per neighbourhood, from the City of Toronto:
# https://open.toronto.ca/dataset/wellbeing-toronto-safety/
df_crime = pd.read_excel("wellbeing-toronto-safety.xlsx")
# Normalise the join-key column name to match the merged dataframe
df_crime.rename(columns={"Neighbourhood Id": "Neighbourhood_Id" }, inplace=True)
df_crime.head()
Out[7]:
Neighbourhood Neighbourhood_Id Arsons Assaults Break & Enters Drug Arrests Fire Medical Calls Fire Vehicle Incidents Fires & Fire Alarms Hazardous Incidents Murders Robberies Sexual Assaults Thefts Total Major Crime Incidents Vehicle Thefts
0 West Humber-Clairville 1 4 390 175 62 1321 502 705 210 0 82 68 54 1119 288
1 Mount Olive-Silverstone-Jamestown 2 3 316 61 90 1016 59 361 176 1 78 75 7 690 62
2 Thistletown-Beaumond Heights 3 0 85 36 16 323 48 90 34 0 17 24 2 192 12
3 Rexdale-Kipling 4 0 59 32 15 305 34 94 55 1 16 20 3 164 18
4 Elms-Old Rexdale 5 1 77 25 14 321 71 107 43 0 23 5 19 185 22
In [8]:
# The crime table repeats the neighbourhood name; drop it so the merge does not
# produce duplicate columns, then join on the (string-typed) neighbourhood id.
df_crime.drop(columns=['Neighbourhood'], inplace=True)
df = pd.merge(
    df.astype({'Neighbourhood_Id': str}),
    df_crime.astype({'Neighbourhood_Id': str}),
    on='Neighbourhood_Id')
df.head()
Out[8]:
Neighbourhood Neighbourhood_Id LONGITUDE LATITUDE Home Prices Child Care Spaces Local Employment Arsons Assaults Break & Enters ... Fire Medical Calls Fire Vehicle Incidents Fires & Fire Alarms Hazardous Incidents Murders Robberies Sexual Assaults Thefts Total Major Crime Incidents Vehicle Thefts
0 Wychwood 94 -79.425515 43.676919 656868 84 5143 1 91 58 ... 507 56 140 96 0 23 13 5 220 15
1 Yonge-Eglinton 100 -79.403590 43.704689 975449 45 11746 0 69 60 ... 307 34 147 115 0 33 9 1 229 14
2 Yonge-St.Clair 97 -79.397871 43.687859 995616 20 7858 0 27 62 ... 312 28 175 67 0 3 5 6 111 8
3 York University Heights 27 -79.488883 43.765736 359372 156 42885 2 282 150 ... 1007 143 798 186 2 99 49 16 776 122
4 Yorkdale-Glen Park 31 -79.457108 43.714672 421045 82 24685 1 181 67 ... 812 174 286 128 0 81 11 14 551 137

5 rows × 21 columns

In [9]:
# Sanity check: 140 neighbourhoods x 21 columns after both merges
df.shape
Out[9]:
(140, 21)



Section 2

Exploratory Data Analysis (EDA)

In [10]:
# Dtypes and non-null counts of the merged dataframe
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 140 entries, 0 to 139
Data columns (total 21 columns):
Neighbourhood                  140 non-null object
Neighbourhood_Id               140 non-null object
LONGITUDE                      140 non-null float64
LATITUDE                       140 non-null float64
Home Prices                    140 non-null int64
Child Care Spaces              140 non-null int64
Local Employment               140 non-null int64
Arsons                         140 non-null int64
Assaults                       140 non-null int64
Break & Enters                 140 non-null int64
Drug Arrests                   140 non-null int64
Fire Medical Calls             140 non-null int64
Fire Vehicle Incidents         140 non-null int64
Fires & Fire Alarms            140 non-null int64
Hazardous Incidents            140 non-null int64
Murders                        140 non-null int64
Robberies                      140 non-null int64
Sexual Assaults                140 non-null int64
Thefts                         140 non-null int64
Total Major Crime Incidents    140 non-null int64
Vehicle Thefts                 140 non-null int64
dtypes: float64(2), int64(17), object(2)
memory usage: 24.1+ KB

Observation:

Neighbourhood and Neighbourhood_Id are object types. Let's keep a note of it. Based on the requirement, we will change the data type.

In [11]:
# Count NaNs per column to confirm the merges introduced no missing data
print(style.BOLD+style.UNDERLINE+"Verify the number of missing values in the dataset:"+style.END)
df.isnull().sum()
Verify the number of missing values in the dataset:
Out[11]:
Neighbourhood                  0
Neighbourhood_Id               0
LONGITUDE                      0
LATITUDE                       0
Home Prices                    0
Child Care Spaces              0
Local Employment               0
Arsons                         0
Assaults                       0
Break & Enters                 0
Drug Arrests                   0
Fire Medical Calls             0
Fire Vehicle Incidents         0
Fires & Fire Alarms            0
Hazardous Incidents            0
Murders                        0
Robberies                      0
Sexual Assaults                0
Thefts                         0
Total Major Crime Incidents    0
Vehicle Thefts                 0
dtype: int64

Observation:

No null values or missing values are found in the dataset.

In [12]:
# Summary statistics for every numeric column
df.describe()
Out[12]:
LONGITUDE LATITUDE Home Prices Child Care Spaces Local Employment Arsons Assaults Break & Enters Drug Arrests Fire Medical Calls Fire Vehicle Incidents Fires & Fire Alarms Hazardous Incidents Murders Robberies Sexual Assaults Thefts Total Major Crime Incidents Vehicle Thefts
count 140.000000 140.000000 1.400000e+02 140.000000 140.0000 140.000000 140.000000 140.00000 140.000000 140.000000 140.000000 140.000000 140.000000 140.000000 140.000000 140.000000 140.000000 140.000000 140.000000
mean -79.400186 43.708841 5.481934e+05 124.685714 9409.3500 1.114286 150.621429 75.25000 36.542857 632.200000 88.200000 240.328571 108.228571 0.350000 34.442857 17.621429 6.478571 351.128571 29.821429
std 0.102044 0.051274 2.676674e+05 81.603541 19125.3383 1.269939 121.538369 42.15595 39.964980 404.120667 76.219316 198.576714 55.022342 0.622423 26.352145 14.641258 6.748800 235.957009 34.029033
min -79.596356 43.592362 2.041040e+05 0.000000 438.0000 0.000000 16.000000 12.00000 0.000000 207.000000 3.000000 45.000000 26.000000 0.000000 3.000000 1.000000 0.000000 81.000000 3.000000
25% -79.479794 43.671009 3.749645e+05 60.000000 2069.5000 0.000000 69.750000 44.75000 13.000000 343.750000 40.000000 119.000000 64.000000 0.000000 16.000000 8.000000 2.750000 185.000000 13.000000
50% -79.403989 43.702021 4.912100e+05 109.500000 4052.5000 1.000000 124.000000 63.00000 25.000000 509.500000 66.000000 182.000000 97.500000 0.000000 28.000000 14.000000 5.000000 294.000000 21.000000
75% -79.331097 43.747294 5.902160e+05 176.000000 10127.0000 2.000000 184.000000 99.00000 46.250000 788.250000 111.500000 277.750000 135.750000 1.000000 44.000000 21.250000 8.000000 412.500000 35.000000
max -79.150843 43.821201 1.849084e+06 441.000000 185891.0000 6.000000 892.000000 219.00000 302.000000 2631.000000 502.000000 1203.000000 272.000000 3.000000 124.000000 75.000000 54.000000 1393.000000 288.000000
Visualize the number of crime incidents in Toronto
In [13]:
# Crime columns are selected positionally: columns 8 through third-from-last
# (Assaults ... Thefts). NOTE(review): this slice must stay in sync with df's
# column order — confirm if the merge order ever changes.
crime_temp_col = list(df.columns[8:-2])
# Total incidents per crime type across all neighbourhoods, largest first
crime_temp = df[crime_temp_col].sum().sort_values(ascending=False)
In [14]:
# Interactive bar chart of total incidents per crime type (plotly)
fig = go.Figure(go.Bar(x=crime_temp.index, y=crime_temp.values,text=crime_temp.values, 
                       textposition='outside',
                       hovertemplate = "%{x}: %{y} </br>", name='', marker_color='rgb(55, 83, 109)'))

fig.update_layout(
    title={'text':'<b>Crime frequency in Toronto</b>',
                       'x':0.5,'xanchor':'center','font':dict(size=20,color='black')},
    xaxis_tickfont_size=14
)

# Outline the bars in black for contrast
fig.update_traces(marker=dict(line=dict(color='#000000', width=2.5)))

# Update xaxis properties
fig.update_xaxes(title_text="Crime Type",  titlefont_size=17, tickfont_size=14)

# Update yaxis properties
fig.update_yaxes(title_text="No. of Incidents", titlefont_size=17, tickfont_size=14)

fig.show()

Observation:

There is a high number of Assaults, Break & Enters, Drug Arrests, Robberies, and Vehicle Thefts cases in the city of Toronto. This makes it essential to consider these cases in our clustering features, as they have a high impact on the decision to choose the right neighbourhood.

For this business case, we will remove fire-related crime cases from the dataset to make sure that the clustering algorithm gives more consideration to cases like Assaults, Break & Enters, etc.

In [15]:
# Drop fire-related incident counts, the pre-computed total, and Murders so the
# clustering focuses on crimes such as Assaults and Break & Enters.
df.drop(columns=['Arsons', 'Fire Medical Calls','Fire Vehicle Incidents','Fires & Fire Alarms','Total Major Crime Incidents',
                'Murders'],
       inplace=True)
Fetch the geolocation of Toronto, Ontario, Canada and plot the neighbourhoods on a map
In [16]:
address = 'Toronto, Ontario, Canada'

# Nominatim requires a descriptive user_agent identifying the application
geolocator = Nominatim(user_agent="Toronto_explorer")

location = geolocator.geocode(address)
# geocode() returns None on failure (rate limit / network); fail loudly instead
# of raising an opaque AttributeError below.
if location is None:
    raise RuntimeError('Geocoding failed for {!r}; please retry.'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geographical coordinates of Toronto, Ontario, Canada are {}, {}.'.format(latitude, longitude))
The geograpical coordinate of Toronto, Ontario, Canada are 43.6534817, -79.3839347.
In [17]:
# Initialize a fixed-height figure so the map renders compactly in the notebook
intializeMap = folium.Figure(height=400)

# Create a map of Toronto centred on the coordinates geocoded above
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10).add_to(intializeMap)

# One MarkerCluster groups nearby markers at low zoom levels. Attach it to the
# map once here — the original re-added it on every loop iteration.
mc = MarkerCluster().add_to(map_toronto)

# Add one circle marker (with a name popup) per neighbourhood
for lat, lng, neighbourhood in zip(df['LATITUDE'], df['LONGITUDE'],
                                   df['Neighbourhood']):
    popup = folium.Popup('{}'.format(neighbourhood), parse_html=True)
    # parse_html is not a CircleMarker argument, so the stray kwarg was dropped
    mc.add_child(folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7))

map_toronto
Out[17]:
Explore the Home Price range of neighbourhoods in Toronto
In [18]:
# Histogram of home prices across neighbourhoods
plt.hist(x=df['Home Prices'], density=False, bins=10)  # `density=False` would make counts
plt.title(label="Distribution of Home prices")
plt.ylabel('Number of Neighbourhoods')
# Assign to _ to suppress the Text(...) repr in the cell output
_ = plt.xlabel('Home price range')
In [19]:
print(style.BOLD+style.UNDERLINE+"Observation:\n"+style.END)

# Count neighbourhoods whose home price lies strictly between $300K and $600K
print("1. There are",
      len(df[(df['Home Prices']>300000)&(df['Home Prices']<600000)]['Home Prices']),
      "neighbourhoods out of",  # fixed typo: "neighbouroods"
      df.shape[0],
      "that have a home price range between $300K to $600K.")

print("\n2. Home price data is right skewed.")
Observation:

1. There are 91 neighbourhoods out of 140 that have a home price range between $300K to $600K.

2. Home price data is right skewed.
In [20]:
# Columns remaining after dropping the fire-related and total-crime features
df.columns
Out[20]:
Index(['Neighbourhood', 'Neighbourhood_Id', 'LONGITUDE', 'LATITUDE',
       'Home Prices', 'Child Care Spaces', 'Local Employment', 'Assaults',
       'Break & Enters', 'Drug Arrests', 'Hazardous Incidents', 'Robberies',
       'Sexual Assaults', 'Thefts', 'Vehicle Thefts'],
      dtype='object')
Visualize range of data for each numeric variable
In [21]:
# Histograms of every numeric feature; columns 4 onward skip the
# name/id/coordinate columns
fig = plt.figure(figsize = (8,8))
ax = fig.gca()
df.iloc[:,4:].hist(ax=ax)
plt.tight_layout()
print(style.BOLD+style.UNDERLINE+"Raw Data:\n"+style.END)
plt.show()
Raw Data:

Observation:

  1. The data for a lot of variables is right skewed. It needs to be further investigated as we will be using K means for clustering which would not perform well on skewed data.



Section 3

Data Preparation

In this section, we will use statistical methods to view the skewness in the dataset and later, attempt to reduce this skewness and see if the data can be transformed to have a normal distribution.

Investigate the skewness in the data
In [22]:
#Latitude and Longitude are sliced (i.e. [-2]) from the result.
# NOTE(review): this assumes LATITUDE/LONGITUDE always land in the two lowest
# positions after the descending sort — confirm if the feature set changes.
df.skew().sort_values(ascending=False)[:-2]
Out[22]:
Local Employment       6.556823
Vehicle Thefts         4.499711
Thefts                 3.469959
Drug Arrests           3.179012
Assaults               2.459699
Home Prices            1.995973
Sexual Assaults        1.774144
Robberies              1.484055
Break & Enters         1.060041
Hazardous Incidents    0.922938
Child Care Spaces      0.886435
dtype: float64

Observation:

Most of the variables have positive skew values. By comparing the skewness scores with the histograms of the variables, it is confirmed that skewness is present in several variables.

Transform data using Scikit Learn's 'Yeo-johnson' method

As the data have a range of values and not all values are positive, we will be using sklearn's power transform with 'yeo-johnson' method to transform the data.

In [23]:
# Yeo-Johnson power transform of every numeric feature (handles zero and
# negative values, unlike Box-Cox). `col` is kept for re-labelling the
# resulting array below.
col = list(df.columns[4:])
df_transformed = power_transform(df[col], method='yeo-johnson')
In [24]:
# power_transform returned a bare ndarray; rebuild a DataFrame with the
# original column labels so the features stay identifiable.
df_transformed = pd.DataFrame(df_transformed,columns=col)
df_transformed
Out[24]:
Home Prices Child Care Spaces Local Employment Assaults Break & Enters Drug Arrests Hazardous Incidents Robberies Sexual Assaults Thefts Vehicle Thefts
0 0.0 -0.353080 0.204700 -0.332895 -0.221035 -0.479007 -0.017411 -0.208738 -0.055170 0.071305 -0.425547
1 0.0 -0.996885 0.904450 -0.698544 -0.161500 0.571788 0.338562 0.264798 -0.483890 -1.378488 -0.518832
2 0.0 -1.577388 0.575645 -1.895902 -0.103693 -2.739974 -0.702236 -2.427222 -1.102324 0.277959 -1.290530
3 0.0 0.522160 1.828062 1.212936 1.536830 0.863702 1.326730 1.827438 1.734147 1.483203 2.095205
4 0.0 -0.381914 1.457913 0.597394 0.033913 0.941827 0.553554 1.528696 -0.253749 1.311555 2.215367
... ... ... ... ... ... ... ... ... ... ... ...
135 0.0 -0.381914 -0.818310 0.897971 0.307187 0.691966 0.158214 1.068940 0.485170 0.616596 0.139952
136 0.0 1.373626 1.736016 1.592017 1.735352 2.414216 0.821673 1.273076 0.780618 1.311555 0.436553
137 0.0 0.223788 -0.374628 0.186930 -0.191044 0.668812 0.372978 0.223639 0.610084 0.616596 1.054221
138 0.0 0.056835 -0.558644 -1.501681 -1.386467 -1.240544 -1.097729 0.137790 -2.309770 -0.848077 -0.425547
139 0.0 0.188798 0.129135 -0.007974 0.976595 0.352178 0.389993 1.251635 -0.614590 0.457621 0.021530

140 rows × 11 columns

In [25]:
fig = plt.figure(figsize = (8,8))
ax = fig.gca()
# NOTE(review): the last two columns ('Thefts', 'Vehicle Thefts') are excluded
# from this plot — confirm that is intentional.
df_transformed.iloc[:,:-2].hist(ax=ax)
plt.tight_layout()
print(style.BOLD+style.UNDERLINE+"Data post application of yeo-johnson method:\n"+style.END)
plt.show()
Data post application of yeo-johnson method:

Observation:

  1. The transformation worked quite well on most of the variables.
  2. Home prices needs to be processed using some other method.

Home prices are all positive values. Let's apply Box-Cox and see if it can help us get a normal distribution.

In [26]:
# Transform Home Prices toward a normal distribution with Box-Cox (valid here
# because every home price is strictly positive).
# stats.boxcox returns (transformed_array, fitted_lambda); keep the data only.
new_homePrice = stats.boxcox(df['Home Prices'])[0]
print("Old Skewscore: ", df['Home Prices'].skew(),
      "\nSkewscore post Boxcox:", pd.Series(new_homePrice).skew())
Old Skewscore:  1.9959733999305982 
Skewscore post Boxcox: 0.00897670248861421
In [27]:
# Histogram of home prices after the Box-Cox transform
plt.hist(x=new_homePrice, density=False, bins=10)  # `density=False` would make counts
plt.title(label="Distribution of Home prices")
plt.ylabel('Number of Neighbourhoods')
plt.xlabel('Home price range')
Out[27]:
Text(0.5, 0, 'Home price range')

Observation:

Home price data now exhibits a fairly normal distribution.

In [28]:
# Replace the yeo-johnson-transformed Home Prices column with the Box-Cox
# transformed values (which achieved near-zero skew), and re-attach the
# neighbourhood names for later joins.
df_transformed.drop(columns=['Home Prices'], inplace=True)
df_transformed.insert(0,'Home Prices', new_homePrice)
df_transformed.insert(0,'Neighbourhood', df['Neighbourhood'])
print(style.BOLD+style.UNDERLINE+"Transformed Data:\n"+style.END)
df_transformed.head()
Transformed Data:

Out[28]:
Neighbourhood Home Prices Child Care Spaces Local Employment Assaults Break & Enters Drug Arrests Hazardous Incidents Robberies Sexual Assaults Thefts Vehicle Thefts
0 Wychwood 2.039378 -0.353080 0.204700 -0.332895 -0.221035 -0.479007 -0.017411 -0.208738 -0.055170 0.071305 -0.425547
1 Yonge-Eglinton 2.039888 -0.996885 0.904450 -0.698544 -0.161500 0.571788 0.338562 0.264798 -0.483890 -1.378488 -0.518832
2 Yonge-St.Clair 2.039912 -1.577388 0.575645 -1.895902 -0.103693 -2.739974 -0.702236 -2.427222 -1.102324 0.277959 -1.290530
3 York University Heights 2.038384 0.522160 1.828062 1.212936 1.536830 0.863702 1.326730 1.827438 1.734147 1.483203 2.095205
4 Yorkdale-Glen Park 2.038674 -0.381914 1.457913 0.597394 0.033913 0.941827 0.553554 1.528696 -0.253749 1.311555 2.215367



Section 4

4.1 Fetching venues based on FourSquare API

Define Foursquare API credentials and version
In [74]:
import os

# Never hardcode credentials in a notebook: read them from the environment
# (export FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET before launching
# Jupyter). The placeholders keep the cell runnable for readers.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '<Enter your Client ID>') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '<Enter your Client Secret>') # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentials:')
print('CLIENT_ID: ' + CLIENT_ID)
# Mask the secret — notebook outputs are frequently shared or committed
print('CLIENT_SECRET: ' + '*' * len(CLIENT_SECRET))
Your credentails:
CLIENT_ID: <Enter your Client ID>
CLIENT_SECRET:<Enter your Client Secret>
Fetch the nearby venues and their details based on each neighbourhood
In [30]:
def getNearbyVenues(names, latitudes, longitudes, radius=500, LIMIT=100):
    '''
    Query the Foursquare "explore" endpoint for venues around each location.

    names: iterable of location names
    latitudes / longitudes: coordinate iterables aligned with `names`
    radius: search radius in metres (default 500)
    LIMIT: maximum venues returned per location (default 100)

    Returns a DataFrame with one row per (neighbourhood, venue) pair.
    Relies on the module-level CLIENT_ID / CLIENT_SECRET / VERSION credentials.
    '''
    base_url = ('https://api.foursquare.com/v2/venues/explore'
                '?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}')

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):

        # build the API request URL for this location
        url = base_url.format(CLIENT_ID, CLIENT_SECRET, VERSION,
                              lat, lng, radius, LIMIT)

        # issue the GET request and unwrap the venue list
        items = requests.get(url).json()["response"]['groups'][0]['items']

        # keep only the fields we analyse downstream
        for item in items:
            venue = item['venue']
            rows.append((name,
                         lat,
                         lng,
                         venue['name'],
                         venue['location']['lat'],
                         venue['location']['lng'],
                         venue['categories'][0]['name']))

    nearby_venues = pd.DataFrame(rows, columns=['Neighbourhood',
                                                'Neighbourhood Latitude',
                                                'Neighbourhood Longitude',
                                                'Venue',
                                                'Venue Latitude',
                                                'Venue Longitude',
                                                'Venue Category'])
    return nearby_venues
Run the above function
In [31]:
# Query Foursquare for venues around every neighbourhood centroid (network call;
# one request per neighbourhood)
toronto_venues = getNearbyVenues(names=df['Neighbourhood'],
                                   latitudes=df['LATITUDE'],
                                   longitudes=df['LONGITUDE'])
Verify that new dataframe is having details of each venue
In [32]:
# One row per venue found, 7 descriptive columns
print(toronto_venues.shape)
toronto_venues.head()
(2036, 7)
Out[32]:
Neighbourhood Neighbourhood Latitude Neighbourhood Longitude Venue Venue Latitude Venue Longitude Venue Category
0 Wychwood 43.676919 -79.425515 Wychwood Barns Farmers' Market 43.680010 -79.423849 Farmers Market
1 Wychwood 43.676919 -79.425515 Wychwood Barns 43.680028 -79.423810 Event Space
2 Wychwood 43.676919 -79.425515 Hillcrest Park 43.676012 -79.424787 Park
3 Wychwood 43.676919 -79.425515 Annabelle Pasta Bar 43.675445 -79.423341 Italian Restaurant
4 Yonge-Eglinton 43.704689 -79.403590 North Toronto Memorial Community Centre 43.706098 -79.404337 Gym
View the number of venues fetched per Neighbourhood
In [33]:
# Number of venues Foursquare returned for each neighbourhood, most first
temp_ = (toronto_venues['Neighbourhood']
         .value_counts()
         .to_frame(name='No. of Venues per Neighbourhood'))
temp_
Out[33]:
No. of Venues per Neighbourhood
Church-Yonge Corridor 100
Mount Pleasant West 70
Bay Street Corridor 64
Junction Area 62
Dufferin Grove 59
... ...
Newtonbrook West 1
Princess-Rosethorn 1
Rustic 1
Willowdale East 1
Willowridge-Martingrove-Richview 1

139 rows × 1 columns

View the total unique categories of venues
In [34]:
# Number of distinct venue categories returned by Foursquare
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))
There are 282 uniques categories.

4.2 Analyze each neighbourhood in Toronto, Ontario, Canada

Convert categorical variable into dummy variables

In the below steps, one-hot encoding is performed on "Venue Category" column of the dataframe.

In [35]:
# One-hot encode the venue categories: one 0/1 indicator column per category
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# Re-attach the neighbourhood label as the leading column
toronto_onehot.insert(0, 'Neighbourhood', toronto_venues['Neighbourhood'])

toronto_onehot.head()
Out[35]:
Neighbourhood African Restaurant American Restaurant Amphitheater Animal Shelter Antique Shop Arcade Argentinian Restaurant Art Gallery Art Museum ... Video Store Vietnamese Restaurant Warehouse Store Wine Bar Wine Shop Wings Joint Women's Store Yoga Studio Zoo Zoo Exhibit
0 Wychwood 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 Wychwood 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 Wychwood 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
3 Wychwood 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
4 Yonge-Eglinton 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 283 columns

In [36]:
# 2036 venue rows x (282 category columns + Neighbourhood)
print("Shape of one-hot encoded dataframe: ", toronto_onehot.shape)
Shape of one-hot encoded dataframe:  (2036, 283)

Group the one-hot encoded dataframe by the "Neighbourhood" column and populate the mean. This is also useful to get a rough idea about the frequency of occurrence of each category

In [37]:
#Group the dataframe by neighbourhood and compute the mean of the values for each neighbourhood
# (the mean of 0/1 indicators is the relative frequency of each venue category)
toronto_grouped = toronto_onehot.groupby('Neighbourhood').mean().reset_index()
toronto_grouped.shape
Out[37]:
(139, 283)

Observation:

After grouping the data based on Neighbourhoods, we should have got 140 rows, one for each neighbourhood. However, the 139 rows above show that Foursquare doesn't have any data for one of the neighbourhoods given in the original data.

4.3 Treating missing values occurred during data processing by FourSquare API

In the next steps, we will find the neighbourhoods for which foursquare did not have any data and fix the missing values.

In [38]:
# Left-join the venue frequencies onto the master dataframe; neighbourhoods
# Foursquare returned nothing for get NaN in every venue column.
toronto_grouped = df.join(toronto_grouped.set_index('Neighbourhood'), on='Neighbourhood')
# Show the rows for the neighbourhoods with no Foursquare data
toronto_grouped[toronto_grouped.isna().any(axis=1)]
Out[38]:
Neighbourhood Neighbourhood_Id LONGITUDE LATITUDE Home Prices Child Care Spaces Local Employment Assaults Break & Enters Drug Arrests ... Video Store Vietnamese Restaurant Warehouse Store Wine Bar Wine Shop Wings Joint Women's Store Yoga Studio Zoo Zoo Exhibit
50 St.Andrew-Windfields 40 -79.379037 43.756246 1363202 124 13023 63 112 5 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

1 rows × 297 columns

Observation:

The neighbourhood with values missing from the FourSquare API data-fetch steps is shown above. We are going to fill these values with the mean of each column.

In [39]:
# Fill the venue-frequency NaNs (neighbourhoods with no Foursquare data) with
# each numeric column's mean. numeric_only=True keeps pandas >= 2.0 from
# raising when object columns such as 'Neighbourhood' are passed to mean().
toronto_grouped.fillna(toronto_grouped.mean(numeric_only=True), inplace=True)
# An empty result confirms no missing values remain
toronto_grouped[toronto_grouped.isna().any(axis=1)]
Out[39]:
Neighbourhood Neighbourhood_Id LONGITUDE LATITUDE Home Prices Child Care Spaces Local Employment Assaults Break & Enters Drug Arrests ... Video Store Vietnamese Restaurant Warehouse Store Wine Bar Wine Shop Wings Joint Women's Store Yoga Studio Zoo Zoo Exhibit

0 rows × 297 columns

Observation:

0 rows signifies that no missing values are present in the dataframe now.

In [40]:
# Columns to drop from toronto_grouped before the venue analysis — everything
# from the master dataframe except the neighbourhood name
tempCol = list(df.columns[1:])
tempCol
Out[40]:
['Neighbourhood_Id',
 'LONGITUDE',
 'LATITUDE',
 'Home Prices',
 'Child Care Spaces',
 'Local Employment',
 'Assaults',
 'Break & Enters',
 'Drug Arrests',
 'Hazardous Incidents',
 'Robberies',
 'Sexual Assaults',
 'Thefts',
 'Vehicle Thefts']
In [41]:
# Keep only Neighbourhood + the venue-frequency columns in toronto_grouped
toronto_grouped.drop(columns=tempCol, inplace=True)
In [42]:
# Back to 140 rows (one per neighbourhood) x (Neighbourhood + 282 categories)
print("Shape of one-hot encoded dataframe grouped by the neighbourhood and its mean: ", toronto_grouped.shape)
Shape of one-hot encoded dataframe grouped by the neighbourhood and its mean:  (140, 283)

4.4 Find the top 10 venues in each neighbourhood of Toronto

Create a function to find the top venues by sorting them in descending order.

In [43]:
def return_most_common_venues(row, num_top_venues):
    '''
    Return the labels of the `num_top_venues` highest-valued entries of a row.

    row: a Series whose first entry is the neighbourhood label and whose
         remaining entries are per-category mean frequencies.
    num_top_venues: how many category labels to return.

    Return value: numpy array of category names, most to least frequent.
    '''
    # Skip position 0 (the neighbourhood label), rank the remaining
    # frequencies high-to-low, and keep only the leading labels.
    frequencies = row.iloc[1:].sort_values(ascending=False)
    return frequencies.index.values[:num_top_venues]

Create column names for the dataframe i.e. top 10 venues and call the above function on each neighbourhood of Toronto

In [44]:
num_top_venues = 10

# Ordinal suffixes for 1st/2nd/3rd; every later rank falls back to 'th'
indicators = ['st', 'nd', 'rd']

# create column headers according to the number of top venues
columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except IndexError:  # narrowed from a bare except: only the lookup can fail
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe with one row per neighbourhood
neighbourhoods_venues_sorted = pd.DataFrame(columns=columns)
neighbourhoods_venues_sorted['Neighbourhood'] = toronto_grouped['Neighbourhood']

for ind in np.arange(toronto_grouped.shape[0]):
    # Rank row `ind`'s venue frequencies and write the top-10 labels in place
    neighbourhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

print(style.BOLD+style.UNDERLINE+"Neighbourhoods and their top 10 common venues:\n"+style.END)
neighbourhoods_venues_sorted.head()
Neighbourhoods and their top 10 common venues:

Out[44]:
Neighbourhood 1st Most Common Venue 2nd Most Common Venue 3rd Most Common Venue 4th Most Common Venue 5th Most Common Venue 6th Most Common Venue 7th Most Common Venue 8th Most Common Venue 9th Most Common Venue 10th Most Common Venue
0 Wychwood Farmers Market Park Italian Restaurant Event Space Farm Electronics Store Elementary School Ethiopian Restaurant Falafel Restaurant Zoo Exhibit
1 Yonge-Eglinton Coffee Shop Fast Food Restaurant Restaurant Gym Shopping Mall Buffet Gas Station Italian Restaurant Japanese Restaurant Persian Restaurant
2 Yonge-St.Clair Coffee Shop Italian Restaurant Restaurant Sushi Restaurant Sandwich Place Café Bank Pub Gym Grocery Store
3 York University Heights Bank Massage Studio Caribbean Restaurant Japanese Restaurant Coffee Shop Furniture / Home Store Fast Food Restaurant Pizza Place Bar Filipino Restaurant
4 Yorkdale-Glen Park Restaurant Furniture / Home Store Fast Food Restaurant Coffee Shop Bookstore Bank Bowling Alley Italian Restaurant Greek Restaurant Grocery Store
Create a final dataframe to which clusters will be added and for post cluster formation Analysis
In [45]:
# Join the top-10 venue labels onto the original data; df_Final is used for
# post-clustering analysis and map rendering
df_Final = df.join(neighbourhoods_venues_sorted.set_index('Neighbourhood'), on='Neighbourhood')

print(style.BOLD+style.UNDERLINE+"The final dataframe having top 10 venues and all other details:\n"+style.END) 
df_Final.head()
The final dataframe having top 10 venues and all other details:

Out[45]:
Neighbourhood Neighbourhood_Id LONGITUDE LATITUDE Home Prices Child Care Spaces Local Employment Assaults Break & Enters Drug Arrests ... 1st Most Common Venue 2nd Most Common Venue 3rd Most Common Venue 4th Most Common Venue 5th Most Common Venue 6th Most Common Venue 7th Most Common Venue 8th Most Common Venue 9th Most Common Venue 10th Most Common Venue
0 Wychwood 94 -79.425515 43.676919 656868 84 5143 91 58 15 ... Farmers Market Park Italian Restaurant Event Space Farm Electronics Store Elementary School Ethiopian Restaurant Falafel Restaurant Zoo Exhibit
1 Yonge-Eglinton 100 -79.403590 43.704689 975449 45 11746 69 60 43 ... Coffee Shop Fast Food Restaurant Restaurant Gym Shopping Mall Buffet Gas Station Italian Restaurant Japanese Restaurant Persian Restaurant
2 Yonge-St.Clair 97 -79.397871 43.687859 995616 20 7858 27 62 0 ... Coffee Shop Italian Restaurant Restaurant Sushi Restaurant Sandwich Place Café Bank Pub Gym Grocery Store
3 York University Heights 27 -79.488883 43.765736 359372 156 42885 282 150 56 ... Bank Massage Studio Caribbean Restaurant Japanese Restaurant Coffee Shop Furniture / Home Store Fast Food Restaurant Pizza Place Bar Filipino Restaurant
4 Yorkdale-Glen Park 31 -79.457108 43.714672 421045 82 24685 181 67 60 ... Restaurant Furniture / Home Store Fast Food Restaurant Coffee Shop Bookstore Bank Bowling Alley Italian Restaurant Greek Restaurant Grocery Store

5 rows × 25 columns

4.5 Cluster the Neighbourhoods

For K means, selection of K is an important factor. For this project, Elbow method and Silhouette score is used to decide the optimum value of k.

Merge the transformed numerical crime, home prices and venue details data for K means clustering.
In [46]:
# Merge the venue-frequency data (toronto_grouped) with the transformed crime /
# home-price features (df_transformed) to build the clustering input
toronto_grouped_clustering = df_transformed.join(toronto_grouped.set_index('Neighbourhood'), on='Neighbourhood')

# Drop the Neighbourhood column: it is categorical and carries no signal for K-means
toronto_grouped_clustering.drop(columns=['Neighbourhood'], inplace=True)
In [47]:
# Evaluate candidate cluster counts K in [2, 10): record the K-means inertia
# (for the Elbow method) and the mean silhouette score for each K.
distortions = []
silhouette = []
K = range(2, 10)

for n_clusters in K:
    model = KMeans(n_clusters=n_clusters, init='k-means++', random_state=0)
    model.fit(toronto_grouped_clustering)
    # Elbow method: within-cluster sum of squares
    distortions.append(model.inertia_)
    # Silhouette score: cohesion vs. separation of the resulting clusters
    silhouette.append(
        silhouette_score(toronto_grouped_clustering, model.labels_,
                         metric='euclidean'))
In [48]:
# Plot the Elbow-method and Silhouette-score curves side by side so the
# optimum K can be read off visually.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))

# Elbow method: inertia (distortion) for every candidate K
axes[0].plot(K, distortions, marker='o', markersize=10)
axes[0].set(xlabel='K', ylabel='Distortion')
axes[0].set_title('Elbow Method showing the optimal k')

# Silhouette score: the first and last entries (K=2 and K=9) are trimmed so
# the plot focuses on the candidate range K=3..8.
axes[1].plot(range(3, 9), silhouette[1:-1], marker='o', markersize=10)
axes[1].set(xlabel='K', ylabel='Silhouette score')
axes[1].set_title('Silhouette score showing the optimal k')

fig.suptitle('Finding optimum K value for K-means by Elbow method and Silhouette score',
             fontsize=16, y=1.05)
fig.subplots_adjust(wspace=0.5)
Observations based on above graphs:
  1. Using the Elbow method, we can see that a local optimum can be found at K=3 or K=4.
  2. Using Silhouette score, we got more confidence to select K=3 as the Silhouette score is highest at that point.

Hence, we will make 3 clusters in this process.

Running K-means for optimum value of K i.e. 3
In [49]:
# set number of clusters based on the Elbow/Silhouette analysis above
kclusters = 3

# Drop the categorical Neighbourhood column as it holds no significance for
# clustering. (Using the `columns` keyword: the positional `axis` argument
# to DataFrame.drop is deprecated.)
# NOTE(review): this rebuilds toronto_grouped_clustering from the venue-only
# frame toronto_grouped, discarding the crime/home-price features merged in
# the previous section — confirm this is intentional.
toronto_grouped_clustering = toronto_grouped.drop(columns='Neighbourhood')

# run k-means clustering with a fixed random_state for reproducibility
kmeans = KMeans(init="k-means++", n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_
Out[49]:
array([2, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1,
       1, 1, 1, 0, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1,
       1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 2, 1, 1,
       1, 1, 1, 1, 2, 1, 2, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1,
       2, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 2, 1])
In [50]:
# Prepend the K-means cluster assignment as the first column of the final
# dataframe, then preview the result.
df_Final.insert(loc=0, column='Cluster Labels', value=kmeans.labels_)
df_Final.head()
Out[50]:
Cluster Labels Neighbourhood Neighbourhood_Id LONGITUDE LATITUDE Home Prices Child Care Spaces Local Employment Assaults Break & Enters ... 1st Most Common Venue 2nd Most Common Venue 3rd Most Common Venue 4th Most Common Venue 5th Most Common Venue 6th Most Common Venue 7th Most Common Venue 8th Most Common Venue 9th Most Common Venue 10th Most Common Venue
0 2 Wychwood 94 -79.425515 43.676919 656868 84 5143 91 58 ... Farmers Market Park Italian Restaurant Event Space Farm Electronics Store Elementary School Ethiopian Restaurant Falafel Restaurant Zoo Exhibit
1 1 Yonge-Eglinton 100 -79.403590 43.704689 975449 45 11746 69 60 ... Coffee Shop Fast Food Restaurant Restaurant Gym Shopping Mall Buffet Gas Station Italian Restaurant Japanese Restaurant Persian Restaurant
2 1 Yonge-St.Clair 97 -79.397871 43.687859 995616 20 7858 27 62 ... Coffee Shop Italian Restaurant Restaurant Sushi Restaurant Sandwich Place Café Bank Pub Gym Grocery Store
3 1 York University Heights 27 -79.488883 43.765736 359372 156 42885 282 150 ... Bank Massage Studio Caribbean Restaurant Japanese Restaurant Coffee Shop Furniture / Home Store Fast Food Restaurant Pizza Place Bar Filipino Restaurant
4 1 Yorkdale-Glen Park 31 -79.457108 43.714672 421045 82 24685 181 67 ... Restaurant Furniture / Home Store Fast Food Restaurant Coffee Shop Bookstore Bank Bowling Alley Italian Restaurant Greek Restaurant Grocery Store

5 rows × 26 columns

In [51]:
# Initialize a fixed-height figure so the embedded map does not dominate the page
intializeMap = folium.Figure(height=400)

# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10).add_to(intializeMap)

# Drop rows without a cluster assignment and convert the label from float to
# int so it can be used as a list index below.
df_Final.dropna(inplace=True)
df_Final = df_Final.astype({"Cluster Labels": int})

# Set the color scheme for the clusters: one evenly spaced rainbow color per
# cluster, converted to hex for folium.
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# Add one circle marker per neighbourhood, colored by its cluster.
for lat, lon, poi, cluster in zip(df_Final['LATITUDE'],
                                  df_Final['LONGITUDE'],
                                  df_Final['Neighbourhood'],
                                  df_Final['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # rainbow[cluster-1] relies on Python's negative indexing: cluster 0
    # wraps around to the last rainbow color.
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_toronto)

map_toronto
Out[51]:
In [52]:
# The coordinates and numeric id were only needed for mapping; keep just the
# features relevant to cluster interpretation.
df_Final = df_Final.drop(columns=['Neighbourhood_Id', 'LATITUDE', 'LONGITUDE'])

3.4 Examine Clusters

In [53]:
# Show every column when displaying wide dataframes (disable truncation).
pd.options.display.max_columns = None

Cluster 1

In [54]:
df_Final[df_Final['Cluster Labels'] == 0]
Out[54]:
Cluster Labels Neighbourhood Home Prices Child Care Spaces Local Employment Assaults Break & Enters Drug Arrests Hazardous Incidents Robberies Sexual Assaults Thefts Vehicle Thefts 1st Most Common Venue 2nd Most Common Venue 3rd Most Common Venue 4th Most Common Venue 5th Most Common Venue 6th Most Common Venue 7th Most Common Venue 8th Most Common Venue 9th Most Common Venue 10th Most Common Venue
25 0 Newtonbrook West 557468 98 9091 140 127 19 108 45 17 8 36 Photography Studio Eastern European Restaurant Electronics Store Elementary School Ethiopian Restaurant Event Space Falafel Restaurant Farm Farmers Market Zoo Exhibit
Observations on Cluster 1:
  1. This cluster has fewer neighbourhoods than the other clusters.
  2. The cluster has fewer restaurants; its most common venues include hardware shops and photography studios.
  3. The home price in this cluster is ~550K.
  4. Medium level of local employment opportunities.
  5. The crime rate is quite high in this cluster.

Cluster 2

In [56]:
df_Final[df_Final['Cluster Labels'] == 1]
Out[56]:
Cluster Labels Neighbourhood Home Prices Child Care Spaces Local Employment Assaults Break & Enters Drug Arrests Hazardous Incidents Robberies Sexual Assaults Thefts Vehicle Thefts 1st Most Common Venue 2nd Most Common Venue 3rd Most Common Venue 4th Most Common Venue 5th Most Common Venue 6th Most Common Venue 7th Most Common Venue 8th Most Common Venue 9th Most Common Venue 10th Most Common Venue
1 1 Yonge-Eglinton 975449 45 11746 69 60 43 115 33 9 1 14 Coffee Shop Fast Food Restaurant Restaurant Gym Shopping Mall Buffet Gas Station Italian Restaurant Japanese Restaurant Persian Restaurant
2 1 Yonge-St.Clair 995616 20 7858 27 62 0 67 3 5 6 8 Coffee Shop Italian Restaurant Restaurant Sushi Restaurant Sandwich Place Café Bank Pub Gym Grocery Store
3 1 York University Heights 359372 156 42885 282 150 56 186 99 49 16 122 Bank Massage Studio Caribbean Restaurant Japanese Restaurant Coffee Shop Furniture / Home Store Fast Food Restaurant Pizza Place Bar Filipino Restaurant
4 1 Yorkdale-Glen Park 421045 82 24685 181 67 60 128 81 11 14 137 Restaurant Furniture / Home Store Fast Food Restaurant Coffee Shop Bookstore Bank Bowling Alley Italian Restaurant Greek Restaurant Grocery Store
7 1 Lawrence Park North 980831 212 3624 33 56 2 138 13 4 6 27 Sushi Restaurant Italian Restaurant Bakery Pub Bank Fast Food Restaurant Sandwich Place Coffee Shop Asian Restaurant Bubble Tea Shop
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
134 1 Keelesdale-Eglinton West 373029 25 1644 125 58 24 58 39 15 1 21 Pizza Place Bakery Sandwich Place Wine Shop Fast Food Restaurant Latin American Restaurant Thrift / Vintage Store Event Space Egyptian Restaurant Electronics Store
135 1 Kennedy Park 293600 82 1824 225 78 48 105 59 20 8 23 Convenience Store Playground Discount Store Coffee Shop Financial or Legal Service Fish & Chips Shop Filipino Restaurant Field Fast Food Restaurant Eastern European Restaurant
136 1 Kensington-Chinatown 477989 247 37205 369 166 197 146 68 25 14 29 Café Mexican Restaurant Coffee Shop Gaming Cafe Vietnamese Restaurant Bakery Vegetarian / Vegan Restaurant Tea Room Cocktail Bar Jazz Club
137 1 Kingsview Village-The Westway 292861 129 2799 134 59 47 117 32 22 8 48 Coffee Shop Grocery Store Pizza Place Bus Line Farmers Market Elementary School Ethiopian Restaurant Event Space Falafel Restaurant Farm
139 1 L'Amoreaux 355438 126 4734 116 112 35 118 67 8 7 21 Chinese Restaurant Pizza Place Bank Discount Store Gym Pool Electronics Store Thrift / Vintage Store Bakery Camera Store Nail Salon

114 rows × 23 columns

In [66]:
# Render floats with two decimal places so the summary statistics table is
# readable, then summarize cluster label 1.
pd.set_option('display.float_format', '{:.2f}'.format)
df_Final[df_Final['Cluster Labels'] == 1].describe()
Out[66]:
Cluster Labels Home Prices Child Care Spaces Local Employment Assaults Break & Enters Drug Arrests Hazardous Incidents Robberies Sexual Assaults Thefts Vehicle Thefts
count 114.00 114.00 114.00 114.00 114.00 114.00 114.00 114.00 114.00 114.00 114.00 114.00
mean 1.00 536885.80 122.53 10064.26 153.53 76.33 38.14 109.79 34.61 16.96 6.72 29.27
std 0.00 255664.57 78.78 20963.05 124.15 43.05 40.90 55.07 26.01 12.71 7.29 35.53
min 1.00 204104.00 0.00 550.00 16.00 12.00 0.00 26.00 3.00 1.00 0.00 3.00
25% 1.00 377010.00 60.00 1976.00 71.25 44.25 14.00 64.50 16.00 8.00 2.25 13.00
50% 1.00 490143.00 109.50 3977.00 125.00 63.50 28.00 99.50 28.50 14.00 4.50 20.50
75% 1.00 576228.25 171.75 10687.00 182.75 104.25 47.75 137.25 43.50 21.75 8.00 31.75
max 1.00 1849084.00 367.00 185891.00 892.00 219.00 302.00 272.00 124.00 68.00 54.00 288.00
Observations on Cluster 2:
  1. The neighbourhoods in this cluster have many nearby restaurants, coffee shops, pubs and bars.
  2. The average home price in this area is in the range of 250K to 600K, with some exceptions.
  3. The crime rate is Medium to High.
  4. A few neighbourhoods have few or no child care spaces.

Cluster 3

In [68]:
df_Final[df_Final['Cluster Labels'] == 2].head(10)
Out[68]:
Cluster Labels Neighbourhood Home Prices Child Care Spaces Local Employment Assaults Break & Enters Drug Arrests Hazardous Incidents Robberies Sexual Assaults Thefts Vehicle Thefts 1st Most Common Venue 2nd Most Common Venue 3rd Most Common Venue 4th Most Common Venue 5th Most Common Venue 6th Most Common Venue 7th Most Common Venue 8th Most Common Venue 9th Most Common Venue 10th Most Common Venue
0 2 Wychwood 656868 84 5143 91 58 15 96 23 13 5 15 Farmers Market Park Italian Restaurant Event Space Farm Electronics Store Elementary School Ethiopian Restaurant Falafel Restaurant Zoo Exhibit
5 2 Lambton Baby Point 806452 19 438 28 20 8 49 12 2 1 8 River Playground Park Garden Zoo Exhibit Electronics Store Elementary School Ethiopian Restaurant Event Space Falafel Restaurant
6 2 Lansing-Westgate 646460 75 12254 70 46 13 97 10 19 4 24 Health & Beauty Service Park IT Services Zoo Exhibit Electronics Store Elementary School Ethiopian Restaurant Event Space Falafel Restaurant Farm
15 2 Milliken 387879 220 16901 79 124 5 43 32 3 15 67 Park Sandwich Place Zoo Exhibit Eastern European Restaurant Electronics Store Elementary School Ethiopian Restaurant Event Space Falafel Restaurant Farm
20 2 Mount Olive-Silverstone-Jamestown 251119 60 3244 316 61 90 176 78 75 7 62 Convenience Store Park Coffee Shop Fish & Chips Shop Financial or Legal Service Filipino Restaurant Field Fast Food Restaurant Farmers Market Eastern European Restaurant
27 2 North Riverdale 818592 233 4033 227 79 27 75 40 19 0 10 Park Café Pool Dog Run Egyptian Restaurant Elementary School Ethiopian Restaurant Event Space Falafel Restaurant Farm
34 2 Parkwoods-Donalda 553698 441 2802 177 106 34 151 51 10 4 41 Food & Drink Shop Park Spa Farm Electronics Store Elementary School Ethiopian Restaurant Event Space Falafel Restaurant Farmers Market
43 2 Rosedale-Moore Park 1265389 162 19160 85 82 9 222 20 12 9 20 Candy Store Playground Park Tennis Court Falafel Restaurant Egyptian Restaurant Electronics Store Elementary School Ethiopian Restaurant Event Space
54 2 Taylor-Massey 254151 60 1399 187 51 28 89 28 19 7 22 Baseball Field Park Theater Zoo Exhibit Farm Elementary School Ethiopian Restaurant Event Space Falafel Restaurant Farmers Market
62 2 West Hill 308229 153 4500 387 102 87 142 71 52 3 46 Park Construction & Landscaping Gym / Fitness Center Farm Electronics Store Elementary School Ethiopian Restaurant Event Space Falafel Restaurant Farmers Market
In [58]:
df_Final[df_Final['Cluster Labels'] == 2].describe()
Out[58]:
Cluster Labels Home Prices Child Care Spaces Local Employment Assaults Break & Enters Drug Arrests Hazardous Incidents Robberies Sexual Assaults Thefts Vehicle Thefts
count 25.00 25.00 25.00 25.00 25.00 25.00 25.00 25.00 25.00 25.00 25.00 25.00
mean 2.00 599385.12 135.60 6435.68 137.80 68.24 29.96 101.12 33.28 20.68 5.32 32.08
std 0.00 322259.09 95.78 6175.73 112.96 37.37 36.10 56.50 28.80 21.65 3.47 27.49
min 2.00 251119.00 19.00 438.00 19.00 20.00 0.00 28.00 4.00 1.00 0.00 3.00
25% 2.00 372380.00 60.00 2207.00 56.00 46.00 8.00 54.00 12.00 8.00 3.00 15.00
50% 2.00 541247.00 115.00 4033.00 91.00 60.00 15.00 86.00 27.00 13.00 5.00 24.00
75% 2.00 784397.00 180.00 8323.00 187.00 84.00 33.00 129.00 40.00 19.00 7.00 42.00
max 2.00 1585984.00 441.00 20150.00 402.00 187.00 126.00 233.00 120.00 75.00 15.00 128.00
Observations on Cluster 3:
  1. The neighbourhoods in this cluster have many nearby outdoor places such as parks, trails, playgrounds and dog runs.
  2. It can be clearly noticed that the neighbourhoods in this cluster have a mix of outdoor places (playgrounds and parks), restaurants, pubs/bars and gyms.
  3. The average home price in this area is in the range of 250K to 800K, with some exceptions.
  4. Every neighbourhood in this cluster has child care spaces.
  5. The local employment rate is Medium to High.
  6. The crime rate is Low to High.
In [59]:
# Restore the pandas display options changed above to their defaults.
# pd.reset_option() accepts a single option pattern per call — passing two
# names in one call feeds the second name into the `silent` parameter and
# only the first option is actually reset — so each option is reset
# individually here.
pd.reset_option('display.max_rows')
pd.reset_option('display.max_columns')
pd.reset_option('display.float_format')
In [73]:
# Build a plotly summary table: one row per cluster with a qualitative
# profile and the number of neighbourhoods it contains.
# (Typos in the displayed text fixed: "Restaunrants" -> "Restaurants",
#  "nighbourhood" -> "neighbourhood".)
values = [['<br>Cluster 1', '<br>Cluster 2','<br>Cluster 3'],
  [
  """
  <b>House Price Range:</b> ~550K<br>
  <b>Childcare Spaces:</b> Low to Medium<br>
  <b>Crime Rate:</b> High. Common cases of Assaults, Break & Enter, and Hazardous Incidents.<br>
  <b>Nearby Venues:</b> Less restaurants, photography studio and Farms, Farmers market<br>
  <b>Local Employment:</b> Medium
  """,
  """
  <b>House Price Range:</b> 250K to 600K<br>
  <b>Childcare Spaces:</b> Medium to High. Some neighbourhoods don't have childcare spaces<br>
  <b>Crime Rate:</b> Medium to High. Common cases of Assaults, Break & Enter.<br>
  <b>Nearby Venues:</b> Restaurants, coffee shops, pubs and bars<br>
  <b>Local Employment:</b> Medium to High 
  """,
  """
  <b>House Price Range:</b> 250K to 800K<br>
  <b>Childcare Spaces:</b> Medium to High (Each neighbourhood has a childcare space)<br>
  <b>Crime Rate:</b> Low to High. Common cases of Assaults, Hazardous Incidents, and Thefts.<br>
  <b>Nearby Venues:</b> Playgrounds or parks, restaurants, pubs/bars, gym and clothing store<br>
  <b>Local Employment:</b> Medium to High
  """
  ],
  # number of neighbourhoods per cluster label
  [df_Final[df_Final['Cluster Labels'] == 0].shape[0],
   df_Final[df_Final['Cluster Labels'] == 1].shape[0],
   df_Final[df_Final['Cluster Labels'] == 2].shape[0]
  ]
         ]


fig = go.Figure(data=[go.Table(
  columnorder = [1,2,3],
  columnwidth = [80,440,110],
  header = dict(
    values = [    ['<b>Cluster/<br>Segments</b>'],
                  ['<b>Cluster/Segment<br>       Details</b>'],
                  ['<b>Number of Neighbourhoods</b>']],
    line_color='darkslategray',
    fill_color='royalblue',
    align=['left','center'],
    font=dict(color='white', size=14),
    height=40
  ),
  cells=dict(
    values=values,
    line_color='darkslategray',
    fill=dict(color=['paleturquoise', 'white']),
    align=['left','left','center'],
    font_size=12,
    height=30)
    )
])
fig.show()
In [ ]: